00_all.qmd

01_load

Load data

read_tsv("./../data/_raw/raw.tsv",
         show_col_types = FALSE) |>
  write_tsv("../data/01_dat_load.tsv")



02_clean

Load Data

full_data <- read_tsv("./../data/01_dat_load.tsv",
                      show_col_types = FALSE)

Split Data

We have extracted one column (sample_tags) that contains comma-separated values into its own dataframe. This allows us to perform separate cleaning operations on it while showcasing our proficiency in joining dataframes.

patient_data <- full_data |>
  select(sample_name, sample_tags)

meta_data <- full_data |>
  select(!sample_tags)

Clean Data

patient_data <- patient_data |>
  mutate(
    case_control = case_when(
    str_detect(string = sample_tags,
               pattern = "Case") ~ "Case",
    str_detect(string = sample_tags,
               pattern = "Control") ~ "Control",
    TRUE ~ NA),
    
    id = case_when(
      str_detect(string = sample_tags,
                 pattern = "Subject") ~ as.double(
                   str_extract(string = sample_tags,
                               pattern = "(?<=Subject[^,])\\d+")),
      
      str_detect(string = sample_tags,
                 pattern = "Control") ~ as.double(
                   str_extract(string = sample_tags,
                               pattern = "(?<=Control[^,])\\d+")),
      TRUE ~ NA),
    
    timepoint = as.double(
      str_extract(string = sample_tags,
                  pattern = "(?<=Timepoint)\\s+\\d+[.\\d+]?")),
    
    gender = case_when(str_detect(string = sample_tags,
                                  pattern = "Female") ~ "Female",
                       str_detect(string = sample_tags,
                                  pattern = "Male") ~ "Male"),
    
    ethnicity = case_when(str_detect(string = sample_tags,
                                     pattern = "Non-Hispanic") ~ "Non-Hispanic",
                          str_detect(string = sample_tags,
                                     pattern = "Hispanic") ~ "Hispanic",
                          TRUE ~ as.character(sample_tags)),
    
    race = case_when(str_detect(string = sample_tags,
                                pattern = "Caucasian") ~ "Caucasian",
                     str_detect(string = sample_tags,
                                pattern = "Asian") ~ "Asian",
                     TRUE ~ as.character(sample_tags)),
    
    age = round(if_else(
                      is.na(as.double(str_extract(string = sample_tags,
                                                  pattern = "\\d+(?=\\s+(Years)\\,)"))),
                      (1/12)*as.double(str_extract(string = sample_tags,
                                                   pattern = "\\d+(?=\\s+(Months)\\,)")),
                      as.double(str_extract(string = sample_tags,
                                            pattern = "\\d+(?=\\s+(Years)\\,)"))),
                digits = 2),
    
    age_diagnosis = as.double(
      str_extract(string = sample_tags,
                  pattern = "\\d*\\.\\d+(?=\\sYears\\sat\\sdiagnosis)")),
    
    age_visit = as.double(
      str_extract(string = sample_tags,
                  pattern = "\\d+(?=\\sYears\\sat\\svisit)")),
    
    GAD65 = as.double(
      str_extract(string = sample_tags,
                  pattern = "(?<=GAD65\\s)\\d+")),
    
    IA_2 = as.double(
      str_extract(string = sample_tags,
                  pattern = "(?<=IA\\-2\\s)\\d+")),
    
    IAA = as.double(
      str_extract(string = sample_tags,
                  pattern = "(?<=IAA )[-+]?[0-9]*\\.?[0-9]+")),
    
    ZnT8 = as.double(
      str_extract(string = sample_tags,
                  pattern = "(?<=ZnT8\\s)\\d+\\.\\d+"))
        
    ) |> 
  
  mutate(
    HLA_A = str_extract_all(string = sample_tags,
                            pattern = "(?<=HLA\\-{1}A\\*{1})\\d{4}"),

    HLA_B = str_extract_all(string = sample_tags,
                            pattern = "(?<=HLA\\-{1}B\\*{1})\\d{4}"),
      
    HLA_C = str_extract_all(string = sample_tags,
                            pattern = "(?<=HLA\\-{1}C\\*{1})\\d{4}"),
      
    HLA_DPA1 = str_extract_all(string = sample_tags,
                               pattern = "(?<=HLA\\-{1}DPA1\\*{1})\\d{4}"),
      
    HLA_DPB1 = str_extract_all(string = sample_tags,
                               pattern = "(?<=HLA\\-{1}DPB1\\*{1})\\d{4}"),
      
    HLA_DQA1 = str_extract_all(string = sample_tags,
                               pattern = "(?<=HLA\\-{1}DQA1\\*{1})\\d{4}"),
      
    HLA_DQB1 = str_extract_all(string = sample_tags,
                               pattern = "(?<=HLA\\-{1}DQB1\\*{1})\\d{4}"),
      
    HLA_DRB1 = str_extract_all(string = sample_tags,
                               pattern = "(?<=HLA\\-{1}A\\*{1})\\d{4}")
    
    )  |>
         
  unnest_wider(col = starts_with("HLA_"),
               names_sep = ",") |> 
  select(!sample_tags)

Join Data

full_data <- meta_data |>
  inner_join(patient_data,
             by = "sample_name")

Write Data

full_data |>
  write_tsv("./../data/02_dat_clean.tsv")



03_augment

Select data

read_tsv("./../data/02_dat_clean.tsv",
         show_col_types = FALSE) |>
  filter(!str_detect(string = sample_name,
                     pattern = "Denver")) |> 
  arrange(case_control, id, timepoint) |> 
  pivot_longer(cols = c(GAD65, ZnT8, IA_2, IAA),
               names_to = "antibody", 
               values_to = "expression") |> 
  pivot_longer(starts_with('HLA_'), 
               names_to = "allele",
               values_to = "subtype") |>
  mutate(allele = str_remove_all(string = allele,
                                 pattern = "(\\,\\d)")) |>
  write_tsv("./../data/03_dat_aug.tsv")



04_describe

Load Data

#|message: false
data <- read_tsv("./../data/03_dat_aug.tsv",
                 show_col_types = FALSE)

Data Description

This dataset comprises a comprehensive registry detailing immunological and demographic characteristics, encompassing 216 observations across 41 variables. It captures data from 54 (29 cases and 25 control) patients across four distinct visits, including demographic details like gender, ethnicity, race, and age at diagnosis and visit. It encompasses key immunological markers such as GAD65, IA_2, IAA, ZnT8 antibodies, along with extensive HLA allele information (HLA_A, HLA_B, HLA_C, HLA_DPA1, HLA_DPB1, HLA_DQA1, HLA_DQB1, HLA_DRB1). Additionally, it contains metrics (e.g., total_templates, total_rearrangements, productive_templates) about the T-cell receptor β region, crucial for diversifying T-cell receptors and recognizing antigens. This dataset elucidates the intricate relationship between demographic factors and immunological profiles across multiple patient visits, providing invaluable insights for in-depth analysis and comprehension. Refer to the complete list of variables provided below.

Variables

data |>
  tbl_vars()
<dplyr:::vars>
 [1] "sample_name"                  "total_templates"             
 [3] "productive_templates"         "fraction_productive"         
 [5] "total_rearrangements"         "productive_rearrangements"   
 [7] "productive_simpson_clonality" "max_productive_frequency"    
 [9] "locus"                        "sku"                         
[11] "test_name"                    "case_control"                
[13] "id"                           "timepoint"                   
[15] "gender"                       "ethnicity"                   
[17] "race"                         "age"                         
[19] "age_diagnosis"                "age_visit"                   
[21] "antibody"                     "expression"                  
[23] "allele"                       "subtype"                     

Age

data |> 
  drop_na(gender, age, case_control) |>
  select(gender, age, case_control, id, timepoint) |> 
  distinct() |> 
  mutate(age = case_when(age < 5 ~ factor("(0,5]"),
                         age < 10 ~ factor("(5,10]"),
                         age < 15 ~ factor("(10,15]"),
                         age < 20 ~ factor("(15,20]"),
                         age < 25 ~ factor("(20,25]"))) |> 
  filter(!is.na(age)) |> 
  ggplot(mapping = aes(x = age,
                       fill = case_control)) +
  geom_bar(alpha = 0.8,
           position = position_dodge(preserve = "single"),
           color = "black") +
  theme_bw() +
  labs(title = "Age group stratified by case/control per timepoint",
       x = "Age",
       y = "Count",
       fill = "Case/Control") +
  scale_fill_manual(values = c(Case = "#481567",
                               Control = "#20A387")) +
  facet_wrap(~timepoint) +
  theme(legend.position = "bottom")

ggsave(width = 8,
       height = 3,
       filename = "04_plot_1.png",
       path = "./../results/")

data |>
  drop_na(age_diagnosis) |>
  ggplot(data = _,
         mapping = aes(x = age_diagnosis)) +
  geom_boxplot(color = "#481567",
               fill = "#20A387",
               alpha = 0.8,
               size = 0.5) +
  theme_bw() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  labs(title = "Age at diagnosis",
       x = '')

ggsave(width = 8,
       height = 3,
       filename = "04_plot_2.png",
       path = "./../results/")

The age distribution of patients predominantly falls within the 0-5 age group. Notably, more than half were diagnosed between ages 10 and 15. The participants underwent testing from early childhood into their twenties. Among the 29 individuals, only 5 received diagnoses after age 15, while 24 were diagnosed before reaching that age. Given the data-set is limited size of 29 observations, the analysis results might diverge from the actual pattern.

TCR-Beta diversity

Total rearrangement

data |>
  group_by(case_control, age_visit, timepoint) |>
  summarize(total_temp_mu = log(mean(total_rearrangements))) |>
  ggplot(data = _,
         mapping = aes(x = age_visit,
                       y = total_temp_mu)) +
  geom_point(mapping = aes(color = timepoint)) +
  labs(title = "Mean of Total Rearrangements vs. Age at visit",
       subtitle = "Faceted by outcome and stratified by timepoint",
       x = "Age",
       y = "log( Total Rearrangement )",
       color = "Timepoint") +
  scale_color_viridis() +
  geom_smooth(method = "lm",
              color = "black") +
  facet_grid(vars(case_control)) +
  theme_bw()

ggsave(width = 8,
       height = 6,
       filename = "04_plot_3.png",
       path = "./../results/")

Productive rearrangement

data |>
  group_by(case_control, age_visit, timepoint) |>
  summarize(prod_temp_mu = log(mean(productive_rearrangements))) |>
  ggplot(data = _,
         mapping = aes(x = age_visit,
                       y = prod_temp_mu)) +
  geom_point(mapping = aes(color = timepoint)) +
  labs(title = "Mean of Productive Rearrangements vs. Age at visit",
       subtitle = "Faceted by outcome and stratified by timepoint",
       x = "Age",
       y = "log( Productive Rearrangement )",
       color = "Timepoint") +
  scale_color_viridis() +
  geom_smooth(method = "lm",
              color = "black") +
  facet_grid(vars(case_control)) +
  theme_bw()

ggsave(width = 8,
       height = 6,
       filename = "04_plot_4.png",
       path = "./../results/")

We see a slight rise in total and productive rearrangements for case patients and a decrease for control patients, when comparing the values against age at visit. So the number of rearrangements seems to be stable in case patients during aging, however there is decreasing rearrangement for the control patients.

Fraction productive rearrangement

data |>
  group_by(case_control,
           age_visit,
           timepoint) |>
  summarize(frac_prod = mean(productive_rearrangements) / mean(total_rearrangements)) |>
  ggplot(data = _,
         mapping = aes(x = age_visit,
                       y = frac_prod)) +
  geom_point(mapping = aes(color = timepoint)) +
  labs(title = "Fraction of Productive Rearrangements vs. Age at visit",
       subtitle = "Faceted by outcome and stratified by timepoint",
       x = "Age",
       y = "Fraction of Productive Rearrangements",
       color = "Timepoint") +
  scale_color_viridis() +
  geom_smooth(method = "lm",
              color = "black") +
  facet_grid(vars(case_control)) +
  theme_bw()

ggsave(width = 8,
       height = 6,
       filename = "04_plot_5.png",
       path = "./../results/")

We see a steep rise in the fraction of productive rearrangement with age (regardless of outcome for patients), which must mean that the differences in values between productive rearrangement and total rearrangement become smaller with age.

Productive Simpson Clonality

data |> 
  ggplot(data = _,
         mapping = aes(x = age_visit,
                       y = log(productive_simpson_clonality))) +
  geom_point(mapping = aes(color = timepoint)) +
  labs(title = "Productive Simpson Clonality vs. Age",
       subtitle = "Faceted by outcome and stratified by timepoint",
       x = "Age",
       y = "log( Productive Simpson Clonality )") +
  scale_color_viridis() +
  geom_smooth(method = "lm",
              color = "black") +
  facet_grid(vars(case_control)) +
  theme_bw()

ggsave(width = 8,
       height = 6,
       filename = "04_plot_6.png",
       path = "./../results/")

Both case and control show a direct proportionality between productive simpson clonality and age, which shows the decrease of polyclonality with age.



05_antibody_expression_analysis

Load Data

data <- read_tsv("./../data/03_dat_aug.tsv",
                 show_col_types = FALSE)

Data Analysis

data |>
  group_by(antibody, timepoint, case_control) |>
  summarise(mean_expression = mean(expression, 
                                   na.rm = TRUE),
            .groups = "drop") |>
  ggplot(mapping = aes(x = factor(timepoint),
                       y = mean_expression,
                       fill = case_control)) +
  geom_bar(stat = "identity",
           position = "dodge",
           alpha = 0.8,
           color = 'black') +
  facet_wrap(vars(antibody),
             scales = "free_y",
             nrow = 2) +
  labs(title = "Antibody expression across visits stratified by case and control",
       x = "Timepoint",
       y = "Mean Expression",
       fill = "Case or Control") +
  scale_fill_manual(values = c(Case = "#481567",
                               Control = "#20A387")) +
  theme_bw() + 
  theme(legend.position = "bottom")

ggsave(width = 8,
       height = 6,
       filename = "05_plot_1.png",
       path = "./../results/")



06_pca_antibody

PCA handling and plotting

conversion_table <- tibble(
  case_control = c("Case", "Control"),
  num_case_control = c(1, 0)
)

data <- read_tsv("./../data/03_dat_aug.tsv",
                 show_col_types = FALSE) |>
  pivot_wider(names_from = "antibody",
              values_from = "expression") |>
  select(case_control, GAD65, IA_2, IAA, ZnT8) |>
  mutate(across(starts_with("GAD65"), 
                ~ if_else(is.na(.), 0.001, .)), 
         across(starts_with("IA_2"), 
                ~ if_else(is.na(.), 0.001, .)), 
         across(starts_with("IAA"), 
                ~ if_else(is.na(.), 0.001, .)), 
         across(starts_with("ZnT8"), 
                ~ if_else(is.na(.), 0.001, .))) |> 
  left_join(conversion_table, 
            by = "case_control")

data_pca <- data |>
  select(-case_control,
         -num_case_control)|>
  scale() |>
  prcomp()

data_pca |>
  augment(data) |>
  ggplot(data = _,
         mapping = aes(x = .fittedPC1, 
                       y = .fittedPC2,
                       color = case_control)) + 
  geom_point(size = 1.5) +
  scale_color_manual(values = c(Case = "#481567",
                                Control = "#20A387")) +
  theme_bw() +
  labs(title = "Plot of individuals",
       color = "Case vs Control")

ggsave(width = 8,
       height = 5,
       filename = "06_plot_1.png",
       path = "./../results/")

data_pca |>
  tidy(matrix = "eigenvalues") |>
  ggplot(mapping = aes(x = PC,
                       y = percent)) +
  geom_col(fill = "#20A387", 
           alpha = 0.8) + 
  scale_x_continuous(breaks = 1:9) +
  scale_y_continuous(labels = scales::percent_format(),
                     expand = expansion(mult = c(0, 0.01))) +
  geom_line(mapping = aes(x = PC),
            color = "#481567") +
  theme_bw() +
  labs(title = "Explained Variance by Principal Components",
       y = "Percentage Contribution")

ggsave(width = 8,
       height = 5,
       filename = "06_plot_2.png",
       path = "./../results/")

arrow_style <- arrow(length = unit(0.2, "cm"), 
                     ends = "first")

num_case_control_coord <- cor(data$num_case_control, data_pca$x) |>
  as_tibble()

data_pca |>
  tidy(matrix = "rotation") |>
  pivot_wider(names_from = "PC", 
              names_prefix = "PC", 
              values_from = "value") |>
  ggplot(mapping = aes(x = PC2,
                       y = PC1)) +
  geom_point(alpha = 0) + 
  coord_flip() +
  theme_bw() +
  labs(title = "Rotation Matrix",
       subtitle = "Exploring Variability Along Principal Components") +
  geom_segment(mapping = aes(xend = 0,
                             yend = 0),
               arrow = arrow_style,
               color = "#481567") +
  geom_segment(data = num_case_control_coord,
               mapping = aes(xend = 0,
                             yend = 0),
               arrow = arrow_style,
               color = "#20A387") +
  geom_label_repel(data = num_case_control_coord,
                  mapping = aes(label = "case_control"),
                  color = "#20A387",
                  size = 3.5) +
  geom_label_repel(mapping = aes(label = column),
                  color = "#481567",
                  size = 3.5)

ggsave(width = 8,
       height = 5,
       filename = "06_plot_3.png",
       path = "./../results/")



07_HLA_allele_analysis

data <- read_tsv("./../data/03_dat_aug.tsv",
                 show_col_types = FALSE)

HLA_tidy <- data |> 
  filter(timepoint == 4) |> 
  mutate(allele = str_remove(string = allele,
                             pattern = "(\\,1)|(\\,2)")) |>
  mutate(allele_subtype = str_c(allele,
                                subtype,
                                sep = ":")) |> 
  drop_na(allele_subtype) |> 
  mutate(subtype_n = 1) |>
  select(case_control, id, allele_subtype, subtype_n) |> 
  group_by(case_control, allele_subtype) |>
  summarize(allele_n = n(), .groups = "drop") |> 
  pivot_wider(names_from = case_control,
              values_from = allele_n) |> 
  na.omit(Control, Case) |> 
  mutate(ratio = round(Case/Control, 
                       digits = 2)) |>
  select(allele_subtype, ratio)

ggplot(data = HLA_tidy,
       mapping = aes(x = reorder(allele_subtype, +ratio),
                     y = ratio)) + 
  geom_bar(stat = "identity",
           color = "#20A387",
           fill = "#20A387",
           alpha = 0.8) + 
  coord_flip() + 
  labs(title = "Ratio of HLA subtype occurence for case and control patients",
       x = "Allele subtype",
       y = "Ratio (Case/Control)") + 
  theme_bw() + 
  theme(panel.grid.major.y = element_blank(),
        axis.text.x= element_text(size=10),
        axis.text.y= element_text(size=7))

ggsave(width = 10,
       height = 10,
       filename = "07_plot_2.png",
       path = "./../results/")
data <- read_tsv("./../data/03_dat_aug.tsv",
                 show_col_types = FALSE)

HLA_tidy <-
  HLA_tidy |>
  ungroup() |>
  mutate(allele = str_extract(string = allele_subtype,
                              pattern = "HLA[^\\:]+"),
         subtype = str_extract(string = allele_subtype, 
                               pattern = "\\d{4}"))


ggplot(data = HLA_tidy,
       mapping = aes(x = subtype,
                     y = allele)) +
  geom_tile(aes(fill = ratio),
            color="#481567") + 
  theme_bw()+
  theme(axis.text.y = element_text(size = 10),
        axis.text.x = element_text(angle = 90,
                                   size = 10,
                                   vjust=0.5),
        panel.grid.major.y = element_blank())+
  scale_fill_gradient(low = "#20A387",
                      high = "#481567") + 
  labs(title = "Ratio of HLA subtypes in case and control patients",
       x = "Subtype",
       y = "Allele",
       fill = "Case/Control")

ggsave(width = 8,
       height = 5,
       filename = "07_plot_1.png",
       path = "./../results/")




08_pca_HLA

Load Data

data <- read_tsv("./../data/03_dat_aug.tsv", 
                 show_col_types = FALSE)

Analysis

MCA HLA subtypes

Percentage of variance explained

HLA_tidy <- data |> 
  filter(timepoint == 1) |>
  select(case_control, id, allele, subtype) |>
  mutate(allele_subtype = str_c(allele,
                                subtype,
                                sep = ":")) |>
  na.omit(allele_subtype) |>
  group_by(case_control, id, allele_subtype) |>
  summarize(subtype_n = n(),
            .groups = "drop") |>
  mutate(subtype_n = str_replace(string = subtype_n,
                                 pattern = "4", "yes")) |>
  pivot_wider(names_from = allele_subtype,
              values_from = subtype_n) |>
  mutate(across(everything(),
                ~replace_na(.x, "no"))) |>
  mutate(across(starts_with("HLA"), as.factor)) |>
  mutate(patient = str_c(case_control,
                         id,
                         sep = ":"))



relevant_var <- unname(as_vector( HLA_tidy |>
                                    pivot_longer(cols = starts_with("HLA"),
                                                 names_to = "subtype",
                                                 values_to = "presence") |>
                                    select(subtype, 
                                           presence) |>
                                    filter(presence == "yes") |>
                                    group_by(subtype) |>
                                    summarize(freq = n()/54) |>
                                    ungroup() |>
                                    filter(freq >= 0.30) |>
                                    select(subtype) ))



HLA.active <- HLA_tidy |>
  select(all_of( relevant_var ))

mca_fit <- MCA(HLA.active, 
               graph = FALSE)



as_tibble_row(mca_fit) |>
  select(eig) |>
  unnest(eig) |>
  reduce("*") |>
  as_tibble() |>
  rename("eigenvalue" = 1,
         "var_percent" = 2,
         "cum_var_percent" = 3) |>
  mutate(Dim = str_c("Dim", seq(1,13)))|>
  ggplot(aes(x = reorder(Dim, - var_percent),
             y = var_percent)) +
  geom_col(fill = "#481567",
           alpha = 0.8) +
  geom_text(aes(label = round(var_percent,1)),
            size = 3,
            vjust = -0.3) +
  theme(axis.text.x = element_text(angle = 90),
        panel.grid.minor = element_blank()) +
  theme_bw() +
  labs(title = "Variance Explained by Each Dimension",
       x = "Dimension",
       y = "Percentage of Variance explained")

ggsave(width = 8,
       height = 5,
       filename = "08_plot_1.png",
       path = "./../results/")

Allele subtype category (present or not) representation

coordinates <- as_tibble_row(mca_fit) |>
  select(var) |>
  unnest_wider(var) |>
  select(coord)|>
  unnest(coord) |>
  reduce("*") |>
  as_tibble() |>
  select(1,2) |>
  rename("coord_Dim1" = 1,
         "coord_Dim2" = 2) |>
  mutate(id=seq(1,26))
  
cos2values <- as_tibble_row(mca_fit) |>
  select(var) |>
  unnest_wider(var) |>
  select(cos2)|>
  unnest(cos2) |>
  reduce("*") |>
  as_tibble() |>
  select(1, 2) |>
  rename("cos2_Dim1" = 1,
         "cos2_Dim2" = 2) |>
  mutate(id = seq(1,26))


full_join(coordinates,
          cos2values,
          by = join_by(id == id)) |>
  mutate(HLA = rownames(data.frame( mca_fit$var )),
         representation = cos2_Dim1 + cos2_Dim2) |>
  ggplot(data = _,
         mapping=aes(x = coord_Dim1,
                     y = coord_Dim2,
                     label = HLA)) +
  geom_vline(xintercept = 0,
             linetype = "dashed",
             linewidth = 0.2) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             linewidth = 0.2) +
  geom_point(aes(color = representation)) +
  geom_label_repel(aes(color = representation),
                   force_pull = 0,
                   size = 2,
                   nudge_y = 0.4,
                   direction = "y") +
  scale_color_gradient2(low="#20A387",
                        mid="#f68f46ff",
                        high="#481567",
                        midpoint=0.4) +
  theme_bw() +
  labs(title = "Dimension 2 vs. Dimension 1",
       subtitle = "Allele subtype Categories stratified by dimensional representation of category",
       x = "Dim 1",
       y = "Dim 2",
       color = "Cos2")

ggsave(width = 8,
       height = 5,
       filename = "08_plot_2.png",
       path = "./../results/")

Individuals in 2D.

as_tibble_row(mca_fit) |>
  select(ind) |>
  unnest_wider(ind) |>
  select(coord)|>
  unnest(coord) |>
  reduce("*") |>
  as_tibble() |>
  select(1, 2) |>
  rename("coord_Dim1" = 1,
         "coord_Dim2" = 2) |>
  mutate(case_control = HLA_tidy |> pull(case_control),
         patient = HLA_tidy |> pull(patient)) |>
  ggplot(data = _,
       mapping = aes(x = coord_Dim1,
                     y = coord_Dim2,
                     label = patient)) +
  geom_vline(xintercept = 0,
             linetype = "dashed",
             linewidth = 0.2) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             linewidth = 0.2) +
  geom_label_repel(mapping = aes(color = case_control),
                   size = 2,
                   max.overlaps = 15) +
  scale_color_manual(values = c(Case = "#481567",
                                Control = "#20A387")) +
  theme_bw() +
  labs(title = "Dimension 2 vs. Dimension 1",
       subtitle = "Patients stratified by outcome",
       x = "Dim 1",
       y = "Dim 2",
       color = "Case/Control")

ggsave(width = 8,
       height = 5,
       filename = "08_plot_3.png",
       path = "./../results/")

None of the dimensions explain enough of the variance themselves. The allele subtype category representation of dimension 1 and 2 show promise of investigative allele subtypes, however since the variance explained by the dimensions is low, no conclusion can be drawn. The patient representation gives less evidence of sufficient variance explained by dimension 1 and 2. A four dimensional explanation of variance could be investigated, as the first 4 dimensions explain a little over 70% of the variance.